Comparación de Modelos

Importación de Datos

In [ ]:
pip install pmdarima
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Requirement already satisfied: pmdarima in /usr/local/lib/python3.7/dist-packages (2.0.1)
Requirement already satisfied: statsmodels>=0.13.2 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (0.13.5)
Requirement already satisfied: Cython!=0.29.18,!=0.29.31,>=0.29 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (0.29.32)
Requirement already satisfied: numpy>=1.21 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.21.6)
Requirement already satisfied: setuptools!=50.0.0,>=38.6.0 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (57.4.0)
Requirement already satisfied: urllib3 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.24.3)
Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.2.0)
Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.7.3)
Requirement already satisfied: pandas>=0.19 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.3.5)
Requirement already satisfied: scikit-learn>=0.22 in /usr/local/lib/python3.7/dist-packages (from pmdarima) (1.0.2)
Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.19->pmdarima) (2022.6)
Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.19->pmdarima) (2.8.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.7/dist-packages (from python-dateutil>=2.7.3->pandas>=0.19->pmdarima) (1.15.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from scikit-learn>=0.22->pmdarima) (3.1.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/dist-packages (from statsmodels>=0.13.2->pmdarima) (0.5.3)
Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.7/dist-packages (from statsmodels>=0.13.2->pmdarima) (21.3)
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=21.3->statsmodels>=0.13.2->pmdarima) (3.0.9)
In [ ]:
# Core numerics, data handling and plotting
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

# scikit-learn: imputation, scaling, time-series CV utilities
import sklearn as sk
from sklearn import impute
from sklearn import preprocessing
import sklearn.externals  # NOTE(review): deprecated module; joblib is imported directly below and should be preferred
import joblib
from sklearn.model_selection import TimeSeriesSplit
from sklearn.impute import KNNImputer
import sklearn.preprocessing

# Deep-learning stack (Keras / TensorFlow) and evaluation helpers
from keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
import time
import plotly.graph_objects as go
from sklearn import metrics

# Classical time-series modelling (statsmodels / SARIMAX)
import statsmodels.api as sm
import statsmodels.tsa.stattools as ts
from statsmodels.tsa.statespace.sarimax import SARIMAX

import tensorflow as tf
import tensorflow.keras.layers as L
import tensorflow.keras.models as M
import tensorflow.keras.backend as K

# Render matplotlib figures inline in the notebook
%matplotlib inline
In [ ]:
import plotly.io as pio
# Force the classic notebook renderer so interactive figures display in this environment
pio.renderers.default='notebook'
In [ ]:
# Colab-only: mount Google Drive so the shared dataset and saved models are reachable
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
In [ ]:
# Daily temperature series (semicolon-separated file with comma decimal marks)
df = pd.read_csv("/content/drive/Shareddrives/Mineria /Temperatura1.csv", sep=';', header=0, decimal = ',')
# Attach an explicit daily DatetimeIndex covering 2017-01-01 .. 2021-12-31
# (1826 days — assumes the CSV has exactly one row per day; verify against the file)
Fecha = pd.date_range(start='2017-01-01', end='2021-12-31', freq='D')
df['Fecha'] = Fecha
df = df.set_index('Fecha')

# Report which dates are missing and what fraction of the series they represent
print(df[pd.isnull(df.ValorObservado)])
print('En total hay' ,
      str(df['ValorObservado'].isnull().sum()) ,
      'valores sin información')
print('Correspondientes al {:.3f}% del total'
      .format(df['ValorObservado'].isnull().sum()*100/len(df)))
            ValorObservado
Fecha                     
2017-08-12             NaN
2017-12-24             NaN
2019-09-15             NaN
2019-09-16             NaN
2019-09-17             NaN
2020-11-12             NaN
2021-01-05             NaN
2021-01-06             NaN
2021-01-07             NaN
2021-01-08             NaN
2021-08-18             NaN
2021-08-20             NaN
2021-12-05             NaN
En total hay 13 valores sin información
Correspondientes al 0.712% del total

La serie presenta valores faltantes, por lo tanto se imputarán usando el método de vecinos más cercanos (KNN), como se muestra a continuación.

Imputación a partir del vecino más cercano

In [ ]:
# Impute the missing observations from the 5 nearest neighbours (uniform weights)
imput = KNNImputer(n_neighbors=5, weights="uniform")

# fit_transform performs the fit and the imputation in a single idiomatic call
# (identical result to the separate fit() + transform() pair)
df['ValorObservado'] = imput.fit_transform(df[['ValorObservado']]).ravel()
print()
print("Valores pérdidos en ValorObservado: " ,
      str(df['ValorObservado'].isnull().sum()))
Valores pérdidos en ValorObservado:  0
In [ ]:
# Interactive view of the full (now gap-free) daily temperature series
fig = px.line(df, x=df.index, y="ValorObservado").update_xaxes(title_text="Fecha")
fig.show()

Separación de datos entrenamiento y validación

Para el respectivo análisis se tomará el 80% de los datos para entrenamiento y validación, y el 20% restante para prueba; dichos valores corresponden a 1460 y 366 observaciones respectivamente.

In [ ]:
from sklearn.preprocessing import MinMaxScaler
# Scale the series into [0, 1]; the fitted scaler is reused later
# to map model predictions back to the original units.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df.values)
# Wrap the scaled values in a DataFrame that keeps the original date index
df_norm = pd.DataFrame(scaled_data,index=df.index, columns=['ValorObservadoNormalizado'])
In [ ]:
def create_dataset(X, y, time_steps=1):
    """Build sliding-window samples for sequence models.

    Each sample is a window of `time_steps` consecutive rows of `X`; its
    target is the row of `y` immediately after the window.

    Parameters
    ----------
    X, y : pandas objects supporting .iloc (typically the same DataFrame)
    time_steps : int, window length (default 1)

    Returns
    -------
    (np.ndarray, np.ndarray) : windows of shape (n, time_steps, n_features)
        and targets of shape (n, n_features), where n = len(X) - time_steps.
    """
    n_windows = len(X) - time_steps
    windows = [X.iloc[start:start + time_steps].values for start in range(n_windows)]
    targets = [y.iloc[start + time_steps] for start in range(n_windows)]
    return np.array(windows), np.array(targets)
In [ ]:
# Chronological 80/20 split (no shuffling — order matters for a time series)
train_size = int(len(df_norm) * 0.8)
test_size = len(df_norm) - train_size
train = df_norm.iloc[:train_size]
test = df_norm.iloc[train_size:]
len_train = len(train)
len_test = len(test)
print(len_train, len_test)
1460 366
In [ ]:
time_steps = 50

# Window both splits into [samples, time_steps, n_features] tensors
X_train, y_train = create_dataset(train, train, time_steps)
X_test, y_test = create_dataset(test, test, time_steps)

for _name, _arr in (("X_train", X_train), ("y_train", y_train),
                    ("X_test", X_test), ("y_test", y_test)):
    print(f"{_name}.shape = ", _arr.shape)
X_train.shape =  (1410, 50, 1)
y_train.shape =  (1410, 1)
X_test.shape =  (316, 50, 1)
y_test.shape =  (316, 1)
In [ ]:
# Sanity check: normalization preserves the shape of the series, only the scale changes
fig = px.line(df_norm, x=df.index, y="ValorObservadoNormalizado")
fig.update_xaxes(title_text="Fecha").update_yaxes(title_text="ValorObservadoNormalizado")
fig.show()

Modelos

In [ ]:
# ARIMA: stored test-window predictions, plus a SARIMAX(3,1,3) re-fitted on the
# 1460-observation training window for in-sample predictions
ARIMA_model = joblib.load('/content/drive/Shareddrives/Mineria /ARIMA_Predict')
ARIMA_final = sm.tsa.statespace.SARIMAX(df[:1460], trend='n', order=(3, 1, 3))
# FIX: dropped use_boxcox=True — SARIMAX.fit's lbfgs optimizer does not accept it
# (this cell's own FutureWarning showed it was ignored, and statsmodels will raise
# after release 0.14), so removing it preserves the fitted results.
results_final = ARIMA_final.fit()

# Pre-trained recurrent networks saved with joblib
#SRNN
SRNN_model = joblib.load('/content/drive/Shareddrives/Mineria /SRNN')

#GRU
GRU_model = joblib.load('/content/drive/Shareddrives/Mineria /GRU')

#LSTM
LSTM_model = joblib.load('/content/drive/Shareddrives/Mineria /LSTM')
/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/base/tsa_model.py:471: ValueWarning:

No frequency information was provided, so inferred frequency D will be used.

/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/base/tsa_model.py:471: ValueWarning:

No frequency information was provided, so inferred frequency D will be used.

/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning:

Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.

/usr/local/lib/python3.7/dist-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning:

Non-invertible starting MA parameters found. Using zeros as starting parameters.

/usr/local/lib/python3.7/dist-packages/statsmodels/base/optimizer.py:23: FutureWarning:

Keyword arguments have been passed to the optimizer that have no effect. The list of allowed keyword arguments for method lbfgs is: m, pgtol, factr, maxfun, epsilon, approx_grad, bounds, loglike_and_score. The list of unsupported keyword arguments passed include: use_boxcox. After release 0.14, this will raise.

/usr/local/lib/python3.7/dist-packages/statsmodels/base/model.py:606: ConvergenceWarning:

Maximum Likelihood optimization failed to converge. Check mle_retvals

Conjunto de Entrenamiento

In [ ]:
# ARIMA: in-sample predictions from the re-fitted model (already in original units)
ARIMA_train = results_final.predict(start=1, end=len(train))

def _predict_and_rescale(model, inputs):
    """Run a Keras model on scaled windows and map the output back to original units."""
    return scaler.inverse_transform(model.predict(inputs))

SRNN_train = _predict_and_rescale(SRNN_model, X_train)
GRU_train = _predict_and_rescale(GRU_model, X_train)
LSTM_train = _predict_and_rescale(LSTM_model, X_train)
45/45 [==============================] - 0s 5ms/step
45/45 [==============================] - 1s 7ms/step
45/45 [==============================] - 1s 14ms/step
In [ ]:
seq_len = 50
# Dates covered by the windowed training targets (first seq_len days have no target)
train_index = df.index[seq_len : len(y_train) + seq_len]

# NOTE(review): ARIMA_train is offset by [45:] while seq_len is 50 — the ARIMA trace
# may be misaligned by 5 days relative to the others; confirm the intended offset.
series = [
    ("Entrenamiento", scaler.inverse_transform(y_train).ravel()),
    ("ARIMA", ARIMA_train[45:].ravel()),
    ("SRNN", SRNN_train.ravel()),
    ("GRU", GRU_train.ravel()),
    ("LSTM", LSTM_train.ravel()),
]

fig = go.Figure()
for label, values in series:
    fig.add_trace(go.Scatter(x=train_index, y=values, mode="lines", name=label))
fig.update_xaxes(title_text="Fecha")
fig.update_yaxes(title_text="ValorObservado")
fig.show()

Conjunto de Prueba

In [ ]:
# ARIMA: test-window forecasts were precomputed and stored in the loaded
# object under the 'ARIMA_Predict' key (already in original units)
ARIMA_test= ARIMA_model['ARIMA_Predict']

# SRNN: predict on scaled windows, then map back to original units
SRNN_test = SRNN_model.predict(X_test)
SRNN_test = scaler.inverse_transform(SRNN_test)

# GRU: same predict + inverse-scale pattern
GRU_test = GRU_model.predict(X_test)
GRU_test = scaler.inverse_transform(GRU_test)

# LSTM: same predict + inverse-scale pattern
LSTM_test = LSTM_model.predict(X_test)
LSTM_test = scaler.inverse_transform(LSTM_test)
10/10 [==============================] - 0s 8ms/step
10/10 [==============================] - 0s 12ms/step
10/10 [==============================] - 0s 26ms/step
In [ ]:
# Overlay each model's test-window forecast on the held-out series.
# NOTE(review): ARIMA_test[45:] vs seq_len = 50 — confirm the 5-day offset is intended.
test_index = df.index[len(y_train) + seq_len :]

fig = go.Figure()
for label, values in (
    ("Prueba", scaler.inverse_transform(y_test).ravel()),
    ("ARIMA", ARIMA_test[45:]),
    ("SRNN", SRNN_test.ravel()),
    ("GRU", GRU_test.ravel()),
    ("LSTM", LSTM_test.ravel()),
):
    fig.add_trace(go.Scatter(x=test_index, y=values, mode="lines", name=label))
fig.update_xaxes(title_text="Fecha")
fig.update_yaxes(title_text="ValorObservado")
fig.show()
In [ ]:
# Test-window residuals (observed minus forecast) per model.
# Hoist the shared observed series instead of recomputing it for every trace.
test_index = df.index[len(y_train) + seq_len :]
observed = scaler.inverse_transform(y_test).ravel()

fig = go.Figure()
for label, forecast in (
    ("ARIMA", ARIMA_test[45:]),
    ("SRNN", SRNN_test.ravel()),
    ("GRU", GRU_test.ravel()),
    ("LSTM", LSTM_test.ravel()),
):
    fig.add_trace(go.Scatter(x=test_index, y=observed - forecast, mode="lines", name=label))
fig.update_xaxes(title_text="Fecha")
fig.update_yaxes(title_text="Error")
fig.show()

Error Cuadrático Medio

Modelo SARIMA SRNN LSTM GRU
ECM (1 paso adelante) 0.68 0.72305 0.680276 0.835264
ECM (5 pasos adelante) 0.84 1.555407 1.494980 1.525437
ECM (100 retardos) 0.695902 0.633088 0.649107